In [1]:
# Library 

import numpy as np 
import pandas as pd 

# Plotly libraries 

import plotly.express as px 
import plotly.graph_objects as go 
import plotly.figure_factory as ff 
from plotly.colors import n_colors 
from plotly.subplots import make_subplots 

# Minmax scaler 

from sklearn.preprocessing import MinMaxScaler 
In [207]:
# Load every dataset used in this notebook.
# All paths are relative to the current working directory.

# COVID-19 sources
covid = pd.read_csv('novel-corona-virus-2019-dataset/covid_19_data.csv')
covid_line = pd.read_csv('novel-corona-virus-2019-dataset/COVID19_line_list_data.csv')
us_counties = pd.read_csv('us-counties-covid-19-dataset/us-counties.csv')

# General tabular datasets
titanic = pd.read_csv('titanic/train.csv')
house = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
campus = pd.read_csv('factors-affecting-campus-placement/Placement_Data_Full_Class.csv')
world = pd.read_csv('world-university-rankings/cwurData.csv')

# Catalogues
netflix = pd.read_csv('netflix-shows/netflix_titles.csv')
google = pd.read_csv('google-play-store-apps/googleplaystore.csv')

# Meta Kaggle tables
user = pd.read_csv('meta-kaggle/Users.csv')
user_achieve = pd.read_csv('meta-kaggle/UserAchievements.csv')
In [208]:
# Preview the first three rows loaded from covid_19_data.csv.
covid.head(3)
Out[208]:
SNo ObservationDate Province/State Country/Region Last Update Confirmed Deaths Recovered
0 1 01/22/2020 Anhui Mainland China 1/22/2020 17:00 1.0 0.0 0.0
1 2 01/22/2020 Beijing Mainland China 1/22/2020 17:00 14.0 0.0 0.0
2 3 01/22/2020 Chongqing Mainland China 1/22/2020 17:00 6.0 0.0 0.0
In [209]:
# Preview the first three rows loaded from COVID19_line_list_data.csv.
covid_line.head(3)
Out[209]:
id case_in_country reporting date Unnamed: 3 summary location country gender age symptom_onset ... recovered symptom source link Unnamed: 21 Unnamed: 22 Unnamed: 23 Unnamed: 24 Unnamed: 25 Unnamed: 26
0 1 NaN 1/20/2020 NaN First confirmed imported COVID-19 pneumonia pa... Shenzhen, Guangdong China male 66.0 01/03/20 ... 0 NaN Shenzhen Municipal Health Commission http://wjw.sz.gov.cn/wzx/202001/t20200120_1898... NaN NaN NaN NaN NaN NaN
1 2 NaN 1/20/2020 NaN First confirmed imported COVID-19 pneumonia pa... Shanghai China female 56.0 1/15/2020 ... 0 NaN Official Weibo of Shanghai Municipal Health Co... https://www.weibo.com/2372649470/IqogQhgfa?fro... NaN NaN NaN NaN NaN NaN
2 3 NaN 1/21/2020 NaN First confirmed imported cases in Zhejiang: pa... Zhejiang China male 46.0 01/04/20 ... 0 NaN Health Commission of Zhejiang Province http://www.zjwjw.gov.cn/art/2020/1/21/art_1202... NaN NaN NaN NaN NaN NaN

3 rows × 27 columns

In [211]:
# Preview the first three rows of the Titanic training data.
titanic.head(3)
Out[211]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
In [212]:
# Preview the first three rows loaded from us-counties.csv.
us_counties.head(3)
Out[212]:
date county state fips cases deaths
0 2020-01-21 Snohomish Washington 53061.0 1 0
1 2020-01-22 Snohomish Washington 53061.0 1 0
2 2020-01-23 Snohomish Washington 53061.0 1 0
In [213]:
# Preview the first three rows of the house-prices training data.
house.head(3)
Out[213]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500

3 rows × 81 columns

In [214]:
# Preview the first three rows loaded from netflix_titles.csv.
netflix.head(3)
Out[214]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 81145628 Movie Norm of the North: King Sized Adventure Richard Finn, Tim Maltby Alan Marriott, Andrew Toth, Brian Dobson, Cole... United States, India, South Korea, China September 9, 2019 2019 TV-PG 90 min Children & Family Movies, Comedies Before planning an awesome wedding for his gra...
1 80117401 Movie Jandino: Whatever it Takes NaN Jandino Asporaat United Kingdom September 9, 2016 2016 TV-MA 94 min Stand-Up Comedy Jandino Asporaat riffs on the challenges of ra...
2 70234439 TV Show Transformers Prime NaN Peter Cullen, Sumalee Montano, Frank Welker, J... United States September 8, 2018 2013 TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob...
In [215]:
# Preview the first three rows loaded from cwurData.csv (university rankings).
world.head(3)
Out[215]:
world_rank institution country national_rank quality_of_education alumni_employment quality_of_faculty publications influence citations broad_impact patents score year
0 1 Harvard University USA 1 7 9 1 1 1 1 NaN 5 100.00 2012
1 2 Massachusetts Institute of Technology USA 2 9 17 3 12 4 4 NaN 1 91.67 2012
2 3 Stanford University USA 3 17 11 5 4 2 2 NaN 15 89.50 2012
In [216]:
# Preview the first three rows loaded from googleplaystore.csv.
google.head(3)
Out[216]:
App Category Rating Reviews Size Installs Type Price Content Rating Genres Last Updated Current Ver Android Ver
0 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1 159 19M 10,000+ Free 0 Everyone Art & Design January 7, 2018 1.0.0 4.0.3 and up
1 Coloring book moana ART_AND_DESIGN 3.9 967 14M 500,000+ Free 0 Everyone Art & Design;Pretend Play January 15, 2018 2.0.0 4.0.3 and up
2 U Launcher Lite – FREE Live Cool Themes, Hide ... ART_AND_DESIGN 4.7 87510 8.7M 5,000,000+ Free 0 Everyone Art & Design August 1, 2018 1.2.4 4.0.3 and up
In [217]:
# Preview the first three rows loaded from UserAchievements.csv.
user_achieve.head(3)
Out[217]:
Id UserId AchievementType Tier TierAchievementDate Points CurrentRanking HighestRanking TotalGold TotalSilver TotalBronze
0 3739822 1 Discussion 1 11/06/2019 0 NaN 3.0 0 0 14
1 3916402 1 Competitions 1 11/06/2019 0 NaN NaN 0 0 0
2 3739823 368 Competitions 1 07/15/2016 892 NaN 75.0 0 0 0
In [218]:
# Preview the first three rows loaded from Users.csv.
user.head(3)
Out[218]:
Id UserName DisplayName RegisterDate PerformanceTier
0 1 kaggleteam Kaggle Team 03/24/2011 5
1 368 antgoldbloom Anthony Goldbloom 01/20/2010 5
2 381 iguyon Isabelle 01/29/2010 2
In [219]:
# Preview the first three rows of the campus placement data.
campus.head(3)
Out[219]:
sl_no gender ssc_p ssc_b hsc_p hsc_b hsc_s degree_p degree_t workex etest_p specialisation mba_p status salary
0 1 M 67.00 Others 91.00 Others Commerce 58.00 Sci&Tech No 55.0 Mkt&HR 58.80 Placed 270000.0
1 2 M 79.33 Central 78.33 Others Science 77.48 Sci&Tech Yes 86.5 Mkt&Fin 66.28 Placed 200000.0
2 3 M 65.00 Central 68.00 Central Arts 64.00 Comm&Mgmt No 75.0 Mkt&Fin 57.80 Placed 250000.0

Plotly express: Functions that can create entire figures at once. It is the recommended starting point for creating most common figures.

Graph objects: The figures created, manipulated and rendered by the plotly Python library are represented by tree-like data structures which are automatically serialized to JSON for rendering by the Plotly.js JavaScript library.

Basic elements in layout:

        - xaxis_title: Plot x axis label
        - yaxis_title: Plot y axis label
        - title: Plot title
        - title_font_size: Plot title font size
        - height: Height of chart
        - width: Width of chart
        - showlegend: False - Disable legend
        - xaxis_type/ yaxis_type: Type of X/Y axis
        - xaxis_showgrid/ yaxis_showgrid: Display grids or not
        - gridcolor: Color of grid 
        - gridwidth: Width of grid

Basic elements in axes:

    - ticks: 'inside'/'outside' - Axes tick marks
    - nticks: Number of ticks
    - tick0: First tick position
    - tickwidth: Width of ticks
    - tickcolor: Color of ticks
    - ticklen: Length of ticks
    - showticklabels: Display tick labels or not
    - tickangle: Angle of tick labels
    - tickfont: Font of tick label ex: dict(family = 'Rockwell', color = 'crimson', size = 14)
    - tickprefix: Prefix of tick labels
    - showline: Line of chart outline
    - linewidth: Line width of chart outline, e.g. 2
    - linecolor: Line color of chart outline, e.g. 'black'
    - mirror: Opposite side of plotting area
    - range: Range limit of axis

Basic Scatter Plot

Purpose: Relationship between numerical values

Question: How strongly does Sale Price depend on Area?

In [3]:
# One marker per house: lot area on the x axis, sale price on the y axis.
fig = px.scatter(house, x='LotArea', y='SalePrice').update_layout(
    title=' Sales Price vs Area',
    xaxis_title='Area',
    yaxis_title='Price',
)
fig.show()

Scatter plot category

Purpose: Relationship between numerical values with a categorical field.

Question: How strongly does Sale Price depend on Area, broken down by the shape of the plot?

In [4]:
# Same scatter as before, with each marker coloured by the categorical LotShape column.
fig = px.scatter(
    house,
    x='LotArea',
    y='SalePrice',
    color='LotShape',
).update_layout(
    title='Sales Price vs Area with Shape',
    xaxis_title='Area',
    yaxis_title='Price',
)
fig.show()

Scatter plot - Color and Size

Purpose: Relationship between numerical values, with categorization by one field (color) and the size of each data point scaled by another numerical field.

Question: How does Score depend on Quality of Education for different countries, with marker size given by the citations column?

In [5]:
# Scatter coloured by country; marker size is driven by the `citations` column.
# NOTE(review): in the preview `citations` looks like a rank (1, 4, 2, ...), so a
# smaller marker may mean a better-ranked university; confirm before interpreting.
fig = px.scatter(
    world,
    x='quality_of_education',
    y='score',
    color='country',
    size='citations',
)
# The title names the field that actually controls marker size (citations);
# the original said "Students", a column this frame does not contain.
fig.update_layout(
    title='Quality of Education vs Score with Country and Citations',
    xaxis_title='Quality of Education',
    yaxis_title='Score',
)
fig.show()

Scatter plot - Color dimension

Purpose: Relationship between numerical values exposing most data points by color gradient.

Question: What is the relationship between Ratings and Reviews, highlighting the most frequently occurring ratings?

In [6]:
# Strip every non-digit character from Reviews, then convert the column to numbers.
# The guard makes the cell safe to re-run: once the column is numeric the `.str`
# accessor is no longer available and the cleaning is already done.
if not pd.api.types.is_numeric_dtype(google['Reviews']):
    # regex=True is required: since pandas 2.0 `str.replace` treats the pattern as a
    # literal string by default, which would leave the values untouched.
    google['Reviews'] = google['Reviews'].str.replace(r'\D', '', regex=True)
    google['Reviews'] = pd.to_numeric(google['Reviews'])
In [7]:
# Marker-only scatter of rating against review count; each marker is coloured
# by its own review count, which produces the gradient.
review_scatter = go.Scatter(
    x=google['Rating'],
    y=google['Reviews'],
    mode='markers',
    marker=dict(color=google['Reviews']),
)
fig = go.Figure(data=review_scatter)

# Fix the x axis range to 0-6.
fig.update_xaxes(range=[0, 6])
fig.update_layout(
    title='Playstore Apps - Reviews vs Ratings with gradient',
    xaxis_title='Ratings',
    yaxis_title='Reviews',
)

fig.show()

Basic Line Plot

Purpose: Relationship between variables with respect to time

Question: How many COVID deaths were observed over time?

In [8]:
# Total deaths per observation date across all regions.
# (The variable keeps its historical name `total_confirmed`, but it holds deaths.)
# Dates are parsed before grouping so rows sort chronologically rather than
# lexicographically on 'MM/DD/YYYY' strings, and so plotly draws a real date axis.
# `assign` returns a new frame, so `covid` itself is left untouched.
total_confirmed = (
    covid.assign(ObservationDate=pd.to_datetime(covid['ObservationDate']))
    [['ObservationDate', 'Deaths']]
    .groupby('ObservationDate')
    .sum()
    .reset_index()
)
In [9]:
# Line plot of the daily death totals held in `total_confirmed`.
fig = go.Figure(
    data=go.Scatter(
        x=total_confirmed['ObservationDate'],
        y=total_confirmed['Deaths'],
        mode='lines',
    )
)

# The plotted series is the Deaths column, so the labels say "deaths", not "cases".
fig.update_layout(
    title='Number of COVID deaths over time',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
)

fig.show()

Types of line plot

Purpose: Relationship between variables with respect to time.

Question: How many COVID deaths were observed over time for different countries? Use a different line style for each country (dash, dashdot, dot).

In [10]:
def _deaths_by_date(frame, country):
    """Return total deaths per observation date for one country.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'Country/Region', 'ObservationDate' and 'Deaths' columns.
    country : str
        Exact value to match in 'Country/Region'.

    Returns
    -------
    pandas.DataFrame
        Columns 'ObservationDate' (datetime) and 'Deaths' (sum over all
        provinces/states reported on that date), sorted by date.
    """
    rows = frame.loc[frame['Country/Region'] == country, ['ObservationDate', 'Deaths']]
    # Parse dates so grouping sorts chronologically instead of by 'MM/DD/YYYY' text.
    rows = rows.assign(ObservationDate=pd.to_datetime(rows['ObservationDate']))
    return rows.groupby('ObservationDate').sum().reset_index()


covid_can = _deaths_by_date(covid, 'Canada')

covid_rus = _deaths_by_date(covid, 'Russia')

covid_uk = _deaths_by_date(covid, 'UK')
In [11]:
# One trace per country, each with a different dash style.
fig = go.Figure()

# Legend label fixed: "Canada" was misspelled "Cannada".
fig.add_trace(go.Scatter(
    x=covid_can['ObservationDate'],
    y=covid_can['Deaths'],
    name='Canada-Dot',
    line=dict(color='royalblue', width=4, dash='dot'),
))

fig.add_trace(go.Scatter(
    x=covid_rus['ObservationDate'],
    y=covid_rus['Deaths'],
    name='Russia-Dashdot',
    line=dict(color='green', width=4, dash='dashdot'),
))

fig.add_trace(go.Scatter(
    x=covid_uk['ObservationDate'],
    y=covid_uk['Deaths'],
    name='UK-Dash',
    line=dict(color='brown', width=4, dash='dash'),
))

# The traces plot the Deaths column, so the labels say "deaths", not "cases".
fig.update_layout(
    title='Number of COVID deaths over time for different countries',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
)

fig.show()

Simple Bubble Plot

Purpose: Displays a quantitative representation, highlighting the most frequently occurring category through the size of the bubble.

Question: How many people travelled in each class of the Titanic? Highlight the most used class.

In [12]:
# Passenger count per ticket class, as a two-column frame: Pclass, Count.
# rename_axis + reset_index(name=...) yields the same columns on every pandas
# version. The previous rename of 'index' / 'Pclass' assumed pre-2.0 naming and
# produces a 'Count' plus a stray 'count' column on pandas >= 2.0.
pclass = (
    titanic['Pclass']
    .value_counts()
    .rename_axis('Pclass')
    .reset_index(name='Count')
)
In [13]:
# Bubble chart: one marker per class, with marker size proportional to the
# passenger count (scaled by 0.3).
class_bubbles = go.Scatter(
    x=pclass['Pclass'],
    y=pclass['Count'],
    mode='markers',
    marker=dict(size=pclass['Count'] * 0.3),
)
fig = go.Figure(data=[class_bubbles])

fig.update_layout(
    title='People travelled in each class of titanic',
    xaxis_title='Class',
    yaxis_title='Number of People',
)
fig.show()

Bubble Plot with Color gradient

Purpose: Displays a quantitative representation, highlighting the most frequently occurring category through the color gradient of the bubble.

Question: How many people of each age category travelled on the Titanic? Highlight the most frequent age category.

In [14]:
# Keep only rows with no missing value in ANY column. This rebinds `titanic`,
# so every later cell works on the reduced frame.
# .copy() gives an independent frame, avoiding a possible SettingWithCopyWarning
# on the column assignment below.
titanic = titanic.dropna().copy()

# Bucket Age into four labels. np.select returns the label of the first matching
# condition, so each test only needs its upper bound; rows matching nothing get "NULL".
age_conditions = [
    titanic['Age'] < 19,
    titanic['Age'] <= 30,
    titanic['Age'] <= 50,
    titanic['Age'] > 50,
]
age_labels = ["below 19", "19-30", "31-50", "Above 50"]
titanic['age_category'] = np.select(age_conditions, age_labels, default="NULL")

# Row count per age bucket, as a two-column frame: age_category, Count.
# rename_axis + reset_index(name=...) is stable across pandas versions, unlike
# renaming the 'index' column, which pandas >= 2.0 no longer creates.
age = (
    titanic['age_category']
    .value_counts()
    .rename_axis('age_category')
    .reset_index(name='Count')
)
In [15]:
# Bubble chart of the age buckets: the count drives both marker size and marker
# colour, and a colour bar is shown.
age_marker = dict(
    color=age['Count'],
    size=age['Count'],
    showscale=True,
)
age_bubbles = go.Scatter(
    x=age['age_category'],
    y=age['Count'],
    mode='markers',
    marker=age_marker,
)
fig = go.Figure(data=[age_bubbles])

fig.update_layout(
    title='Different Age People in Titanic',
    xaxis_title='Age Category',
    yaxis_title='Number of People',
)

fig.show()

Simple Bar Chart

Purpose: Displays quantitative representation of variable

Question: How many universities in each country have good score? (filtered for universities with score greater than 60)

In [16]:
# Number of ranking rows with score > 60 per country, as columns: country, count.
# rename_axis + reset_index(name=...) is version-independent. The old rename of
# 'index' / 'country' yields duplicate 'count' columns and no 'country' column on
# pandas >= 2.0.
top_countries = (
    world.loc[world['score'] > 60, 'country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)
In [17]:
# One bar per country; bar height is the number of qualifying ranking rows.
country_bars = go.Bar(
    x=top_countries['country'],
    y=top_countries['count'],
)
fig = go.Figure(data=[country_bars])

fig.update_layout(
    title_text='Top Countries with number of Universities score greater than 60',
    xaxis_title='Country',
    yaxis_title='Number of Universities',
)

fig.show()

Bar Chart - Gradient & Text Position

Purpose: Displays quantitative representation of a variable highlighting the most counts with color gradient and text position for all bars

Question: Into which genres do most Google Play Store apps fall? Highlight from the highest count to the lowest.

In [18]:
# The ten most frequent genres, as a two-column frame: Genres, Count.
# value_counts() sorts by descending frequency, so head(10) is the top ten.
# rename_axis + reset_index(name=...) gives the same columns on every pandas
# version; renaming 'index' only worked before pandas 2.0.
apps = (
    google['Genres']
    .value_counts()
    .head(10)
    .rename_axis('Genres')
    .reset_index(name='Count')
)
apps.head(3)
Out[18]:
Genres Count
0 Tools 842
1 Entertainment 623
2 Education 549
In [19]:
# Bars coloured by their own count on the Viridis scale, with the count
# printed outside (above) each bar.
genre_bars = go.Bar(
    x=apps['Genres'],
    y=apps['Count'],
    marker=dict(color=apps['Count'], colorscale='Viridis'),
    text=apps['Count'],
    textposition='outside',
)
fig = go.Figure(genre_bars)

fig.update_layout(
    title_text='Top Genres Google Playstore Apps',
    xaxis_title='App Genres',
    yaxis_title='Number of Apps',
)

fig.show()

Bar Chart - Stack/ Group

Purpose: Displays quantitative representation of a variable grouping/stacking the bars

Question: How many shows/movies were released in Netflix by India & United States over past 5 years? (Grouping or Stacking countries)

In [20]:
# Titles released in 2015-2020 (inclusive), counted per release year, for rows
# whose `country` is exactly 'India' or exactly 'United States'.
# NOTE(review): the equality test excludes multi-country rows such as
# "United States, India, ..." (visible in the preview); confirm this is intended.
recent_years = list(range(2015, 2021))
is_recent = netflix['release_year'].isin(recent_years)

# rename_axis + reset_index(name=...) gives columns release_year, count on every
# pandas version; renaming 'index' only worked before pandas 2.0.
top_release_india = (
    netflix.loc[(netflix['country'] == 'India') & is_recent, 'release_year']
    .value_counts()
    .rename_axis('release_year')
    .reset_index(name='count')
)

top_release_us = (
    netflix.loc[(netflix['country'] == 'United States') & is_recent, 'release_year']
    .value_counts()
    .rename_axis('release_year')
    .reset_index(name='count')
)
In [21]:
# Two bar traces, one per country, stacked on top of each other per release year.
india_bars = go.Bar(
    x=top_release_india['release_year'],
    y=top_release_india['count'],
    name='India',
    marker_color='blue',
)
us_bars = go.Bar(
    x=top_release_us['release_year'],
    y=top_release_us['count'],
    name='United States',
    marker_color='violet',
)

fig = go.Figure()
fig.add_trace(india_bars)
fig.add_trace(us_bars)

# barmode='stack' stacks the traces; 'group' would place them side by side.
fig.update_layout(
    title_text='Netflix shows by India/US over past 5 years',
    xaxis_title='Year',
    yaxis_title='Number of Shows',
    barmode='stack',
)

fig.show()

Facet Bar Chart

Purpose: Displays collected view of different categorical features with respect to single numerical variable.

Question: What is the total fare paid by passengers of each sex, broken down by class, survival and port of embarkation? (Facet variables - Survived, Pclass; Grouped bar - Embarked; Single numerical (Y axis) - Fare)

In [22]:
# Total fare for every (Sex, Survived, Embarked, Pclass) combination present
# in the data, returned as a flat frame with those four keys plus Fare.
facet_keys = ['Sex', 'Survived', 'Embarked', 'Pclass']
facet_titanic = titanic.groupby(facet_keys, as_index=False)['Fare'].sum()

facet_titanic.head(2)
Out[22]:
Sex Survived Embarked Pclass Fare
0 female 0 C 1 28.7125
1 female 0 S 1 303.1000
In [23]:
# Grid of grouped bar charts: rows are faceted by Survived, columns by Pclass;
# within each panel, bars are grouped by Sex and coloured by Embarked.
fig = px.bar(
    facet_titanic,
    x="Sex",
    y="Fare",
    color="Embarked",
    barmode="group",
    facet_row="Survived",
    facet_col="Pclass",
)
# The title lists the dimensions actually used; the original mentioned "Age",
# which this chart does not use.
fig.update_layout(
    title_text='Facet view of Titanic passengers Fare with respect to Sex,Survived,Class,Embarked'
)

fig.show()

Horizontal Bar Chart

Purpose: Displays quantitative representation of a variable in horizontal manner.

Question: How many Play Store apps fall into each category?

In [24]:
# The 15 most frequent categories, as a two-column frame: Category, Count.
# rename_axis + reset_index(name=...) is version-independent; renaming 'index'
# only worked before pandas 2.0.
#
# `ascending` must be a real bool. The original passed the string 'False', which
# older pandas treated as truthy (hence the ascending order in the saved output)
# and current pandas rejects with a ValueError. ascending=True keeps that order,
# which puts the largest bar at the top of the horizontal bar chart.
app_category = (
    google['Category']
    .value_counts()
    .head(15)
    .rename_axis('Category')
    .reset_index(name='Count')
    .sort_values('Count', ascending=True)
)

app_category[:3]
Out[24]:
Category Count
14 NEWS_AND_MAGAZINES 283
13 SOCIAL 295
12 PHOTOGRAPHY 335
In [25]:
# Horizontal bars: categories on the y axis, counts along the x axis.
fig = go.Figure(
    go.Bar(
        y=app_category['Category'],
        x=app_category['Count'],
        orientation='h',
    )
)

# With orientation='h' the x axis carries the app count and the y axis the
# category, so the axis titles are set accordingly (the y axis was previously
# labelled 'Number of Apps').
fig.update_layout(
    title_text='Top 15 Google Playstore App Categories',
    xaxis_title='Number of Apps',
    yaxis_title='Category',
)
fig.show()

Dot Chart

Purpose: Displays quantitative representation of a variable in a horizontal manner with dots as categorical feature.

Question: What is the average score for different countries in 2014 & 2015?

In [26]:
# Mean score per (country, year).
grouped_df = world.groupby(['country', 'year'], as_index=False)['score'].mean()

# First 20 rows for 2014, in the frame's row order (sorted by country, then year).
is_2014 = grouped_df['year'] == 2014
grouped_df_2014 = grouped_df.loc[is_2014, ['country', 'score']].head(20)

# Same selection for 2015.
is_2015 = grouped_df['year'] == 2015
grouped_df_2015 = grouped_df.loc[is_2015, ['country', 'score']].head(20)

grouped_df_2015.head(3)
Out[26]:
country score
1 Argentina 44.593333
5 Australia 45.812222
7 Austria 44.987500
In [27]:
# Dot chart: countries on the y axis, mean score on the x axis, with one
# marker-only trace per year (2014 in red, 2015 in blue).
fig = go.Figure()
for year_scores, dot_color, year_label in (
    (grouped_df_2014, 'red', '2014'),
    (grouped_df_2015, 'blue', '2015'),
):
    fig.add_trace(go.Scatter(
        y=year_scores['country'],
        x=year_scores['score'],
        marker=dict(color=dot_color, size=12),
        mode='markers',
        name=year_label,
    ))

fig.update_layout(
    title='Average scores for years - 2014&2015',
    xaxis_title='Score',
    yaxis_title='Country',
)

fig.show()

Basic Pie

Purpose: Displays quantitative representation in pie with label and textinfo

Question: What is split distribution count of netflix program types?

In [28]:
# Number of titles per programme type, as a two-column frame: type, count.
# rename_axis + reset_index(name=...) gives these columns on every pandas
# version; renaming 'index' only worked before pandas 2.0.
net_category = (
    netflix['type']
    .value_counts()
    .rename_axis('type')
    .reset_index(name='count')
)
net_category.head(2)
Out[28]:
type count
0 Movie 4265
1 TV Show 1969
In [29]:
# Pie of programme types. Slices show value and percentage as radially oriented
# text; hovering shows the label and percentage.
type_pie = go.Pie(
    labels=net_category['type'],
    values=net_category['count'],
    hoverinfo='label+percent',
    textinfo='value+percent',
    textfont_size=15,
    insidetextorientation='radial',
)
fig = go.Figure(type_pie)

# title_x=0.5 centres the title horizontally.
fig.update_layout(title='Netflix Show Types', title_x=0.5)
fig.show()

Pie with custom colors

Purpose: Displays a quantitative representation in a pie chart, with custom colors assigned to the labels

Question: What is split distribution count of titanic age categories? Highlight each category with different color.

In [30]:
# Row count per age bucket, as a two-column frame: age_category, count.
# rename_axis + reset_index(name=...) gives these columns on every pandas
# version; renaming 'index' only worked before pandas 2.0.
titanic_age = (
    titanic['age_category']
    .value_counts()
    .rename_axis('age_category')
    .reset_index(name='count')
)

titanic_age
Out[30]:
age_category count
0 31-50 82
1 19-30 46
2 Above 50 32
3 below 19 23
In [31]:
# One colour per slice, applied in the order of the rows in `titanic_age`.
colors = ['red', 'green', 'yellow', 'blue']

# Pie of age buckets with the custom slice colours and a 2px black outline
# around every slice; slice text shows percentage and label.
age_pie = go.Pie(
    labels=titanic_age['age_category'],
    values=titanic_age['count'],
    hoverinfo='label+percent',
    textinfo='percent+label',
    textfont_size=15,
    marker=dict(
        colors=colors,
        line=dict(color='#000000', width=2),
    ),
)
fig = go.Figure(age_pie)

fig.update_layout(title='Titanic Age Categories', title_x=0.5)

fig.show()

Donut

Purpose: Displays quantitative representation in pie with donut shape.

Question: What is distribution count of google playstore apps content rating?

In [32]:
# Number of apps per content rating, as a two-column frame: Content Rating, count.
# rename_axis + reset_index(name=...) gives these columns on every pandas
# version; renaming 'index' only worked before pandas 2.0.
content = (
    google['Content Rating']
    .value_counts()
    .rename_axis('Content Rating')
    .reset_index(name='count')
)
In [33]:
# Donut chart: a pie whose `hole` removes the inner 90% of the radius,
# leaving a thin ring. Slice text shows the percentage only.
rating_donut = go.Pie(
    labels=content['Content Rating'],
    values=content['count'],
    hole=0.9,
    hoverinfo='label+percent',
    textinfo='percent',
    textfont_size=15,
)
fig = go.Figure(rating_donut)

fig.update_layout(title='Google Apps Content Rating', title_x=0.5)
fig.show()

Sunburst

Purpose: Displays quantitative representation of many categorical variables by size distribution.

Question: Describe the spread among age category, sex and survived people with respect to fare in titanic?

In [34]:
# Human-readable survival label: 1 -> 'Survived', 0 -> 'Died', anything else -> 'null'.
titanic['survived_or_not'] = np.select(
    [titanic['Survived'] == 1, titanic['Survived'] == 0],
    ['Survived', 'Died'],
    default='null',
)

# Total fare for each (Sex, survival, age bucket, Cabin, Embarked) combination.
sun_keys = ['Sex', 'survived_or_not', 'age_category', 'Cabin', 'Embarked']
sun_df = titanic.groupby(sun_keys, as_index=False)['Fare'].sum()
In [ ]:
 
In [35]:
# Preview the first three rows of the aggregated sunburst input frame.
sun_df.head(3)
Out[35]:
Sex survived_or_not age_category Cabin Embarked Fare
0 female Died 19-30 C22 C26 S 151.5500
1 female Died 19-30 G6 S 10.4625
2 female Died 31-50 C49 C 28.7125
In [36]:
# Sunburst hierarchy from the centre outwards: Sex -> survival -> age bucket.
# Each sector's size is the summed Fare.
fig = px.sunburst(
    sun_df,
    path=['Sex', 'survived_or_not', 'age_category'],
    values='Fare',
).update_layout(
    title='Titanic distribution by Sex, Survived, Age Category',
    title_x=0.5,
)
fig.show()

Sunburst Gradient

Purpose: Displays quantitative representation of many categorical variables by size distribution with color gradient

Question: Describe the spread among age category, sex and survived people with respect to fare, highlighting the most frequently occurring scenario with a color gradient, in the Titanic data.

In [37]:
# Same hierarchy as the plain sunburst, with sectors additionally coloured by
# Fare on the continuous 'orrd' scale.
fig = px.sunburst(
    sun_df,
    path=['Sex', 'survived_or_not', 'age_category'],
    values='Fare',
    color=sun_df['Fare'],
    color_continuous_scale='orrd',
)

fig.update_layout(
    title="Titanic distribution by Sex, Survived, Age Category",
    title_x=0.5,
)

fig.show()

Sunburst Color

Purpose: Displays a quantitative representation of many categorical variables by size distribution, with discrete colors

Question: Describe the spread among age category, sex and survived people with respect to fare, highlighting survival in discrete colors.

In [38]:
# Discrete colours keyed on the survival label. The '(?)' entry is the colour
# for sectors that have no single survival value.
survival_palette = {
    '(?)': 'black',
    'Died': 'red',
    'Survived': 'darkblue',
}

fig = px.sunburst(
    sun_df,
    path=['Sex', 'survived_or_not', 'age_category'],
    values='Fare',
    color='survived_or_not',
    color_discrete_map=survival_palette,
)

fig.update_layout(
    title='Titanic distribution by Sex, Survived, Age Category',
    title_x=0.5,
)

fig.show()

Basic Time Series

Purpose: Relationship between variables with respect to time

Question: How many COVID deaths were observed over time in Australia?

In [39]:
# Convert the observation dates to real datetimes, in place on `covid`.
covid['ObservationDate'] = pd.to_datetime(covid['ObservationDate'])

# Total deaths per date for Australia (summed over its provinces/states).
is_australia = covid['Country/Region'] == 'Australia'
covid_aus = (
    covid.loc[is_australia, ['ObservationDate', 'Deaths']]
    .groupby(['ObservationDate'])
    .agg('sum')
    .reset_index()
)
In [40]:
# Time series of daily death totals for Australia.
fig = go.Figure(
    data=go.Scatter(
        x=covid_aus['ObservationDate'],
        y=covid_aus['Deaths'],
        mode='lines',
        marker_color='violet',
    )
)

# The plotted series is the Deaths column, so the labels say "deaths", not "cases".
fig.update_layout(
    title='Australia COVID deaths over time',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
)

fig.show()

Time Series - Range Slider

Purpose: Relationship between variables with respect to time, with a range slider

Question: How many COVID deaths were observed over time with range slider in France?

In [41]:
# Total deaths per date for France (summed over all of its reported regions).
is_france = covid['Country/Region'] == 'France'
covid_fra = (
    covid.loc[is_france, ['ObservationDate', 'Deaths']]
    .groupby(['ObservationDate'])
    .agg('sum')
    .reset_index()
)

covid_fra.head(3)
Out[41]:
ObservationDate Deaths
0 2020-01-24 0.0
1 2020-01-25 0.0
2 2020-01-26 0.0
In [42]:
# Time series of daily death totals for France.
fig = go.Figure(
    data=go.Scatter(
        x=covid_fra['ObservationDate'],
        y=covid_fra['Deaths'],
        mode='lines',
        marker_color='darkblue',
    )
)

# Adds a draggable range slider under the x axis.
fig.update_xaxes(rangeslider_visible=True)

# The plotted series is the Deaths column, so the labels say "deaths", not "cases".
fig.update_layout(
    title='France COVID deaths over time',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
)

fig.show()

Time Series - Custom Date Range

Purpose: Relationship between variables with respect to time, with a custom date range

Question: How many COVID deaths were observed between March and July in Mexico?

In [43]:
# Total deaths per date for Mexico.
is_mexico = covid['Country/Region'] == 'Mexico'
covid_mex = (
    covid.loc[is_mexico, ['ObservationDate', 'Deaths']]
    .groupby(['ObservationDate'])
    .agg('sum')
    .reset_index()
)
In [44]:
# Time series of daily death totals for Mexico, restricted to a fixed date window.
fig = go.Figure(
    data=go.Scatter(
        x=covid_mex['ObservationDate'],
        y=covid_mex['Deaths'],
        mode='lines',
        marker_color='darkred',
    )
)

# Window runs from 1 March up to 1 July, i.e. through the end of June.
# The original upper bound '2020-06-31' is not a real calendar date (June has
# 30 days), so the intended range could not be applied reliably.
# The plotted series is Deaths, so the labels say "deaths", not "cases".
fig.update_layout(
    title='Mexico COVID deaths over time',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
    xaxis_range=['2020-03-01', '2020-07-01'],
)
fig.show()

Gantt Chart

Purpose: Display time range for labels

Question: How long did Top 10 ranked kagglers take to become Grandmaster?

In [45]:
# Filtered top 10 ranked competition GM 

# Competition achievement rows whose current ranking is 10 or better.
is_competition = user_achieve['AchievementType'] == 'Competitions'
is_top_ten = user_achieve['CurrentRanking'] <= 10
top_rank = user_achieve[is_competition & is_top_ten]

# Matching user profiles; `Id` becomes `UserId` so it can serve as the join key.
user_final = user[user['Id'].isin(top_rank['UserId'])]
user_final = user_final.rename(columns={'Id': 'UserId'})

# Join profiles to achievements, then parse both dates once and reuse them.
user_df = pd.merge(user_final, top_rank, on='UserId')
user_df['RegisterDate'] = pd.to_datetime(user_df['RegisterDate'])
user_df['TierAchievementDate'] = pd.to_datetime(user_df['TierAchievementDate'])

# Days elapsed between registration and the tier achievement date.
elapsed = user_df['TierAchievementDate'] - user_df['RegisterDate']
user_df['diff_days'] = elapsed.dt.days

# Keep only the columns needed for the Gantt charts.
gantt_source_columns = [
    'CurrentRanking',
    'UserName',
    'DisplayName',
    'RegisterDate',
    'TierAchievementDate',
    'diff_days',
]
user_df = user_df[gantt_source_columns]
In [46]:
# Preview the first two rows of the merged user/achievement frame.
user_df.head(2)
Out[46]:
CurrentRanking UserName DisplayName RegisterDate TierAchievementDate diff_days
0 4.0 philippsinger Psi 2012-03-29 2020-06-17 3002
1 5.0 titericz Giba 2012-08-23 2016-07-15 1422
In [47]:
# ff.create_gantt expects specific column names:
#   Task     - label of the bar
#   Start    - start date
#   Finish   - end date
#   Complete - continuous value (used for colouring when passed as index_col)
gantt_column_names = {
    'DisplayName': 'Task',
    'RegisterDate': 'Start',
    'TierAchievementDate': 'Finish',
    'diff_days': 'Complete',
}
gantt_df = user_df.rename(columns=gantt_column_names)[['Task', 'Start', 'Finish', 'Complete']]

gantt_df
Out[47]:
Task Start Finish Complete
0 Psi 2012-03-29 2020-06-17 3002
1 Giba 2012-08-23 2016-07-15 1422
2 CPMP 2012-12-16 2018-08-14 2067
3 dott 2013-02-14 2019-09-11 2400
4 Μαριος Μιχαηλιδης KazAnova 2013-06-24 2016-07-15 1117
5 Russ Wolfinger 2015-07-30 2017-07-09 710
6 Guanshuo Xu 2015-12-02 2018-12-05 1099
7 bestfitting 2016-09-07 2017-07-24 320
8 NQ 2017-09-28 2020-07-01 1007
9 Dieter 2017-11-17 2020-03-17 851
In [48]:
# Plain Gantt chart: one bar per row of gantt_df, grid lines on both axes
gantt_options = dict(show_colorbar=False, showgrid_x=True, showgrid_y=True)
fig = ff.create_gantt(gantt_df, **gantt_options)

fig.update_layout(title_text='Top 10 Ranked Kagglers Duration to Become Grandmaster')
fig.show()

Gantt Chart with gradient

Purpose: Display time range for labels with gradient.

Question: How long did the Top 10 ranked Kagglers take to become Grandmaster? Also differentiate the Kagglers who took the most time from those who took the least.

In [49]:
# Rescale 'Complete' (the day difference) to the 0-100 range, longest first

scaler = MinMaxScaler()
gantt_df_grad = gantt_df.sort_values(by='Complete', ascending=False)
scaled_complete = scaler.fit_transform(gantt_df_grad[['Complete']]) * 100
gantt_df_grad[['Complete']] = scaled_complete

gantt_df_grad
Out[49]:
Task Start Finish Complete
0 Psi 2012-03-29 2020-06-17 100.000000
3 dott 2013-02-14 2019-09-11 77.554064
2 CPMP 2012-12-16 2018-08-14 65.137957
1 Giba 2012-08-23 2016-07-15 41.088740
4 Μαριος Μιχαηλιδης KazAnova 2013-06-24 2016-07-15 29.716629
6 Guanshuo Xu 2015-12-02 2018-12-05 29.045488
8 NQ 2017-09-28 2020-07-01 25.615213
9 Dieter 2017-11-17 2020-03-17 19.798658
5 Russ Wolfinger 2015-07-30 2017-07-09 14.541387
7 bestfitting 2016-09-07 2017-07-24 0.000000
In [50]:
# Gantt chart coloured by the scaled 'Complete' value (index_col) on the
# 'Blackbody' colour scale, with the colour bar shown.
fig = ff.create_gantt(gantt_df_grad, colors='Blackbody', index_col='Complete',
                      show_colorbar=True, bar_width=0.2,
                      showgrid_x=True, showgrid_y=True)

fig.update_layout(title='Top 10 Ranked Kagglers Duration to Become Grandmaster with time gradient')
# Render explicitly, like every other figure cell, instead of relying on the
# cell's last expression being displayed.
fig.show()

Basic Box Plot

Purpose: Display distribution of a continuous variable.

Question: How are the scores spread for different universities in Germany?

In [51]:
# Scores of the universities whose country is Germany
is_germany = world['country'] == 'Germany'
germany_score = world.loc[is_germany, 'score']
germany_score.head(2)
Out[51]:
81    45.33
82    45.21
Name: score, dtype: float64
In [52]:
# Single box summarising the German university scores
germany_box = go.Box(y=germany_score, name='Germany Score')
fig = go.Figure(data=[germany_box])
fig.update_layout(title_text='Distribution of Germany University Scores')
fig.show()

Grouped Box Plot

Purpose: Display distribution of a continuous variable for two or more groups

Question: How is the score spread for universities in developing countries - India & Brazil ?

In [53]:
# Score columns for the two countries being compared
score_brazil = world.loc[world['country'] == 'Brazil', 'score']
score_india = world.loc[world['country'] == 'India', 'score']
In [54]:
# One box per country, drawn side by side in the same figure
fig = go.Figure(data=[
    go.Box(y=score_india, marker_color='blue', name='India Score'),
    go.Box(y=score_brazil, marker_color='red', name='Brazil Score'),
])

fig.update_layout(title_text='Distribution of University Scores for Developing Countries - India & Brazil')

fig.show()

Box Plot Mean & SD

Purpose: Display distribution of a continuous variable for two or more groups with Mean and Standard Deviation.

Question: How is the rating distribution for playstore app categories- Map & Lifestyle? Highlight with mean and standard Deviation

In [55]:
# Ratings of the two app categories being compared
rating_maps = google.loc[google['Category'] == 'MAPS_AND_NAVIGATION', 'Rating']
rating_life = google.loc[google['Category'] == 'LIFESTYLE', 'Rating']
In [56]:
# boxmean=True draws only the mean; boxmean='sd' draws the mean and the
# standard deviation.
maps_box = go.Box(y=rating_maps,
                  boxmean=True,
                  marker_color='green',
                  name='Maps Apps Rating')
life_box = go.Box(y=rating_life,
                  boxmean='sd',
                  marker_color='darkorchid',
                  name='Lifestyle Apps Rating')

fig = go.Figure(data=[maps_box, life_box])

fig.update_layout(title_text='Distribution of Google Playstore App categories - Maps &Lifestyle')

fig.show()

Box Plot Styling

Purpose: Display distribution of a continuous variable for two or more groups with all different boxplot visualization

Question: How is the rating distribution for 4 playstore app categories? Highlight with possible boxpoints for each plot

In [57]:
# Rating column for each of the four categories shown in the styled box plot
rating_maps = google.loc[google['Category'] == 'MAPS_AND_NAVIGATION', 'Rating']
rating_life = google.loc[google['Category'] == 'LIFESTYLE', 'Rating']
rating_tool = google.loc[google['Category'] == 'TOOLS', 'Rating']
rating_business = google.loc[google['Category'] == 'BUSINESS', 'Rating']
In [61]:
# Four boxes, each demonstrating a different `boxpoints` setting.
fig = go.Figure()

# boxpoints='all': every observation is drawn, jittered and offset from the box
fig.add_trace(go.Box(y=rating_maps, jitter=0.3,
                     pointpos=-1.8, boxpoints='all',
                     marker_color='green',
                     name='Maps Apps Rating - All points'))

# boxpoints=False: no individual points, only box and whiskers
fig.add_trace(go.Box(y=rating_life, boxpoints=False,
                     marker_color='darkorchid',
                     name='Lifestyle Apps Rating - Only Whiskers'))

# boxpoints='suspectedoutliers'. The colour was previously given twice
# (marker=dict(color='black') and marker_color='magenta'); a single marker dict
# states it once: magenta for the trace, black for the highlighted outliers.
fig.add_trace(go.Box(y=rating_tool, boxpoints='suspectedoutliers',
                     marker=dict(
                         color='magenta',
                         outliercolor='black'),
                     name='Tools Apps Rating - Suspected Outliers'))

# boxpoints='outliers': only the points beyond the whiskers are drawn
fig.add_trace(go.Box(y=rating_business, boxpoints='outliers',
                     marker_color='chocolate',
                     name='Business Apps Rating - Outliers & Whiskers'))


fig.update_layout(title='Distribution of Google Playstore App categories - Maps, Lifestyle, Tools, Business')

fig.show()

Basic Histogram

Purpose: Display distribution of continuous variable

Question: What is the salary distribution of Commerce & Management graduates?

In [65]:
# Salaries of 'Comm&Mgmt' graduates with missing salaries removed.
# dropna() is applied to the salary column only, so a row is no longer
# discarded because some unrelated column is empty.
campus_computer = campus.loc[campus['degree_t'] == 'Comm&Mgmt', 'salary'].dropna()
In [67]:
# Fixed-width bins: 10,000 wide, from 200,000 to 1,000,000
salary_bins = dict(start=200000, end=1000000, size=10000)
salary_hist = go.Histogram(x=campus_computer, marker_color='green', xbins=salary_bins)

fig = go.Figure(data=[salary_hist])
fig.show()

Histogram Normalization

Purpose: Display distribution of a continuous variable.

Question: What is the salary distribution of Science & Technology graduates in a normalized manner?

In [75]:
# Salary column for the 'Sci&Tech' degree type
campus_science = campus.loc[campus['degree_t'] == 'Sci&Tech', 'salary']
campus_science.iloc[:1]
Out[75]:
0    270000.0
Name: salary, dtype: float64
In [76]:
# histnorm='probability' normalises the bar heights so they sum to 1.
# For a horizontal plot, pass the data as y=campus_science instead of x.
fig = go.Figure(data=[go.Histogram(x=campus_science, histnorm='probability',
                                   marker_color='orange')])

# The y axis shows probabilities, not raw counts, so it is labelled accordingly.
fig.update_layout(title='Distribution of Salary for Science Graduates',
                  xaxis_title='Salary', yaxis_title='Probability')

fig.show()

Histogram Overlaid

Purpose: Display distribution of a continuous variable for different groups

Question: What is the degree-percentage distribution of Commerce & Management and Science & Technology graduates in an overlaid manner?

In [85]:
# Degree percentage for each of the two degree types
per_com = campus.loc[campus['degree_t'] == 'Comm&Mgmt', 'degree_p']
per_sci = campus.loc[campus['degree_t'] == 'Sci&Tech', 'degree_p']
In [87]:
# Two histograms of degree percentage drawn on top of each other.
# per_com is filtered on degree_t == 'Comm&Mgmt', so it is labelled as
# Commerce graduates (it was previously labelled "Computer").
fig = go.Figure()

fig.add_trace(go.Histogram(x=per_com, marker_color='green', name='Commerce Graduates'))
fig.add_trace(go.Histogram(x=per_sci, marker_color='orange', name='Science Graduates'))

# Overlay both histograms

fig.update_layout(barmode='overlay')

# Reduce opacity to see both histograms

fig.update_traces(opacity=0.75)
fig.update_layout(title='Distribution of Percentage for Commerce & Science Graduates',
                  xaxis_title='Percentage', yaxis_title='Counts')

fig.show()

Histogram Stack

Purpose: Display distribution of a continuous variable for different groups

Question: What is the degree-percentage distribution of Commerce & Management and Science & Technology graduates in a stacked manner?

In [90]:
# Two histograms of degree percentage stacked on top of each other.
# per_com is filtered on degree_t == 'Comm&Mgmt', so it is labelled as
# Commerce graduates (it was previously labelled "Computer").
fig = go.Figure()
fig.add_trace(go.Histogram(x=per_com, marker_color='green', name='Commerce Graduates'))
fig.add_trace(go.Histogram(x=per_sci, marker_color='orange', name='Science Graduates'))

# Stack both histograms

fig.update_layout(barmode='stack')

# Slightly transparent bars, matching the overlaid version

fig.update_traces(opacity=0.75)
fig.update_layout(title='Distribution of Percentage for Commerce & Science Graduates',
                  xaxis_title='Percentage', yaxis_title='Counts')

fig.show()

Distplot

Purpose: Display distribution of a continuous variable.

Question: What is the price distribution for house with 4 rated condition?

In [91]:
# Sale prices of houses with OverallCond equal to 4, 5 and 6 respectively
class_1, class_2, class_3 = (
    house.loc[house['OverallCond'] == cond, 'SalePrice'] for cond in (4, 5, 6)
)
In [97]:
# Distplot of a single group, using 10,000-wide bins
hist_data = [class_1]
group_labels = ['Price Distribution for 4 rated condition houses']
colors = ['blue']
fig = ff.create_distplot(hist_data=hist_data,
                         group_labels=group_labels,
                         colors=colors,
                         bin_size=[10000])

fig.show()

Multiple Distplot

Purpose: Display distribution of a continuous variable for multiple categories

Question: What is the price distribution for houses with 4,5 & 6 rated condition?

In [99]:
# Same distplot with three groups, one colour and one bin size per group
hist_data = [class_1, class_2, class_3]
group_labels = [
    'Price Distribution for 4 rated condition houses',
    'Price Distribution for 5 rated condition houses',
    'Price Distribution for 6 rated condition houses',
]
colors = ['blue', 'green', 'orange']

fig = ff.create_distplot(hist_data=hist_data,
                         group_labels=group_labels,
                         colors=colors,
                         bin_size=[10000] * 3)

fig.show()

Distplot Curve

Purpose: Display distribution of a continuous variable for multiple categories with hist curve instead of bar.

Question: What is the price distribution for houses with 4,5 & 6 rated condition?

In [101]:
# Three groups again, with the histogram bars switched off (show_hist=False)
hist_data = [class_1, class_2, class_3]
group_labels = [
    'Price Distribution for 4 rated condition house',
    'Price Distribution for 5 rated condition houses',
    'Price Distribution for 6 rated condition houses',
]
colors = ['blue', 'green', 'orange']

fig = ff.create_distplot(hist_data=hist_data,
                         group_labels=group_labels,
                         show_hist=False,
                         colors=colors,
                         bin_size=[10000] * 3)

fig.show()

Symmetric Error Bars

Purpose: Display the variability of data and used on graphs to indicate the error

Question: What is the variability or SD of house prices? Common values between high and low interval

In [117]:
# Plain line of SalePrice against LotArea.
# NOTE(review): lot_area is not defined in this part of the notebook; it is
# presumably created by an earlier cell - confirm before a fresh run.
price_line = go.Scatter(x=lot_area['LotArea'], y=lot_area['SalePrice'], mode='lines')
fig = go.Figure(data=price_line)

fig.show()
In [106]:
# Error bars of equal length above and below: one tenth of each price
symmetric_error = dict(type='data',
                       color='red',
                       array=lot_area['SalePrice'] / 10,
                       visible=True)

fig = go.Figure(data=go.Scatter(x=lot_area['LotArea'],
                                y=lot_area['SalePrice'],
                                error_y=symmetric_error))

fig.update_layout(title_text='Sales Price Vs Area - Symmetric Error Bars',
                  xaxis_title='Area',
                  yaxis_title='Price')

fig.show()

Asymmetric Error Bars

Purpose: Display the variability of data and used on graphs to indicate the error.

Question: What is the variability or SD of house prices? Different values between high and low interval.

In [107]:
# symmetric=False: `array` gives the upper bar length, `arrayminus` the lower
asymmetric_error = dict(type='data',
                        color='red',
                        symmetric=False,
                        array=lot_area['SalePrice'] / 100 - 500,
                        arrayminus=lot_area['SalePrice'] / 100 + 10000)

fig = go.Figure(data=go.Scatter(x=lot_area['LotArea'],
                                y=lot_area['SalePrice'],
                                error_y=asymmetric_error))

fig.update_layout(title_text='Sales Price vs Area - Asymmetric Error Bars',
                  xaxis_title='Area',
                  yaxis_title='Price')

fig.show()

2D Hist

Purpose: Display the density of two continuous variable

Question: How dense is the relationship between the college interview test percentage and the degree percentage?

In [119]:
# 2D histogram of the two percentage columns, default binning
density_trace = go.Histogram2d(x=campus['etest_p'], y=campus['degree_p'])
fig = go.Figure(data=[density_trace])

fig.update_layout(title_text='Density of Interview Test &Degree Percent Age',
                  xaxis_title='Test Percentage',
                  yaxis_title='Degree Percentage')

fig.show()

2D Hist Bin

Purpose: Display the density of two continuous variable with custom bin size

Question: How much dense would be the relationship between college test percentage for interview and degree percentage with custom bin size of 20?

In [120]:
# Same 2D histogram with y-axis bins starting at 30 and 20 wide
degree_bins = {'start': 30, 'size': 20}
density_trace = go.Histogram2d(x=campus['etest_p'],
                               y=campus['degree_p'],
                               coloraxis='coloraxis',
                               ybins=degree_bins)
fig = go.Figure(data=[density_trace])

fig.update_layout(title_text='Density of Interview Test & Degree Percentage with bin size 20',
                  xaxis_title='Test Percentage',
                  yaxis_title='Degree Percentage')

fig.show()

2D Facet

Purpose: Display the density of two continuous variable with facet of many categories.

Question: How much dense would be the relationship between Age and Fare price in Titanic? Facet by Sex and Survived

In [125]:
# Age vs Fare density, one panel per (Survived, Sex) combination
fig = px.density_heatmap(
    titanic,
    x='Age',
    y='Fare',
    facet_row='Survived',
    facet_col='Sex',
)

fig.update_layout(title_text='Density heatmap of Age vs Fare with Survived and Sex')

fig.show()

Density Contour

Purpose: Display the contour lines of 2D numerical array z, i.e interpolated lines of isovalues of z

Question: How dense is the relationship between LotFrontage and LotArea, with SalePrice interpolated as contour lines?

In [126]:
# Rows of `house` whose OverallQual equals 10
cond_10 = house.loc[house['OverallQual'] == 10]
In [129]:
# Contour trace: LotFrontage on x, LotArea on y, SalePrice as the z value
price_contour = go.Contour(x=cond_10['LotFrontage'],
                           y=cond_10['LotArea'],
                           z=cond_10['SalePrice'],
                           colorscale='Electric')
fig = go.Figure(data=[price_contour])

fig.update_layout(title_text='Density Contour of house price based on Area and Fontage')
fig.show()

Density Contour Colorbar

Purpose: Display the contour lines of a 2D numerical array z, i.e interpolated lines of isovalues of z

Question: How much dense would be the relationship between LotFrontage and LotArea with interpolation of SalesPrice? Modify ColorBar

In [131]:
# Same contour as before with a customised colour bar.
# The colour-bar title is configured through the nested `title` object
# (text / side / font) rather than the deprecated `titleside` and `titlefont`
# shortcuts.
fig = go.Figure(data=go.Contour(
    x=cond_10['LotFrontage'],
    y=cond_10['LotArea'],
    z=cond_10['SalePrice'],
    colorscale='gnbu',
    colorbar=dict(
        title=dict(
            text='House Price',
            side='right',
            font=dict(
                size=14,
                family='Arial, sans-serif')))))
fig.update_layout(title='Density Contour of house price based on Area and Frontage')

fig.show()

2D Histogram Contour Subplot

Purpose: Display the contour and histogram of two continuous values

Question: How much dense would be the relationship between Price and LotArea? Showcase density and histogram of both values

In [132]:
# Rows of `house` whose OverallQual equals 8
cond_8 = house[house['OverallQual'] == 8]

x = cond_8['LotArea']
y = cond_8['SalePrice']

# Main panel: density contour plus the raw points, both on axes x / y
density_contour = go.Histogram2dContour(x=x, y=y,
                                        colorscale='gray', reversescale=True,
                                        xaxis='x', yaxis='y')
raw_points = go.Scatter(x=x, y=y,
                        xaxis='x', yaxis='y',
                        mode='markers',
                        marker=dict(color="red", size=3))

# Marginal histograms: SalePrice on the extra x axis (x2), LotArea on the
# extra y axis (y2)
price_margin = go.Histogram(y=y, xaxis='x2', marker=dict(color="blue"))
area_margin = go.Histogram(x=x, yaxis='y2', marker=dict(color="blue"))

fig = go.Figure(data=[density_contour, raw_points, price_margin, area_margin])

# The main axes use 0-0.85 of the figure; the marginal axes use 0.85-1
main_axis = dict(zeroline=False, domain=[0, 0.85], showgrid=False)
margin_axis = dict(zeroline=False, domain=[0.85, 1], showgrid=False)

fig.update_layout(
    autosize=False,
    xaxis=dict(main_axis),
    yaxis=dict(main_axis),
    xaxis2=dict(margin_axis),
    yaxis2=dict(margin_axis),
    height=600,
    width=600,
    bargap=0,
    hovermode='closest',
    showlegend=False,
    title_text="Density Contour of Price and Area for Condition 8 houses",
    title_x=0.5,
)

fig.show()

Basic Violin

Purpose: Display the distribution of a continuous variable

Question: How much spread does the Indian university scores have?

In [133]:
# Rows of `world` whose country is India
ind_score = world.loc[world['country'] == 'India']
In [134]:
# Single violin of the India scores, placed at the x position 'India score'
india_violin = go.Violin(y=ind_score['score'],
                         marker_color='blue',
                         x0='India score')
fig = go.Figure(data=india_violin)

fig.update_layout(title_text='Distribution of India Universities score')
fig.show()

Violin Boxplot

Purpose: Display the distribution of a continuous variable with violin and boxplot

Question: How much spread does the Portugal university scores have?

In [135]:
# Rows of `world` whose country is Portugal (the variable keeps its original name)
aus_score = world.loc[world['country'] == 'Portugal']
In [137]:
# Violin of the Portugal scores with an embedded box plot and a mean line.
fig = go.Figure(data=go.Violin(y=aus_score['score'],
                               box_visible=True, line_color='black',
                               meanline_visible=True, fillcolor='lightseagreen',
                               opacity=0.6, x0='Portugal score'))

# The title previously ended in a stray "score_india", although the data
# plotted here is filtered on Portugal.
fig.update_layout(yaxis_zeroline=False, title='Distribution of Portugal Universities score')

fig.show()

RidgeLine Plot

Purpose: Display the distribution of a multiple continuous variable

Question: How much spread does universities scores from top countries have?

In [158]:
# The ten most frequent countries in `world`, most frequent first.
# The labels are read from the value_counts() index directly; the earlier
# reset_index()['index'] lookup depends on a column name that pandas >= 2.0
# no longer produces.
names = world['country'].value_counts().index[:10].tolist()

# One Series of scores per country, in the same order as `names`
temp_list = [world.loc[world['country'] == country, 'score'] for country in names]

# The Series may differ in length, and recent NumPy versions refuse to build a
# ragged array implicitly. Store each Series in its own slot of a 1-D object
# array so iterating final_arr yields one country's scores at a time.
final_arr = np.empty(len(temp_list), dtype=object)
for slot, country_scores in enumerate(temp_list):
    final_arr[slot] = country_scores
In [164]:
# Ten colours interpolated between the two RGB end points
colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', 10, colortype='rgb')

# One violin per country, each with its own line colour
ridge_traces = [
    go.Violin(x=country_scores, line_color=shade, name=country)
    for country_scores, shade, country in zip(final_arr, colors, names)
]
fig = go.Figure(data=ridge_traces)

# Horizontal, positive side only and wide: the violins read as ridgelines
fig.update_traces(orientation='h', side='positive', width=2, points=False)
fig.update_layout(title_text='Distribution of top countries Scores',
                  xaxis_showgrid=False,
                  yaxis_zeroline=True,
                  height=800)

fig.show()

Parallel Categories

Purpose: Display more than one categorical variables distribution in a parallelized view

Question: How deep are the connections linked between survival, age category and sex?

In [166]:
# Parallel-categories diagram across three columns of `titanic`
category_dims = ['survived_or_not', 'age_category', 'Sex']
fig = px.parallel_categories(titanic, dimensions=category_dims)

fig.update_layout(title_text='Titanic Parallel Categories Diagram')
fig.show()

Parallel Categories Styled

Purpose: Display more than one categorical variables distribution in a parallelized view

Question: How deep are the connections linked between survival, pclass and sex over age?

In [175]:
# Same diagram coloured by Age on a continuous scale, with two relabelled columns
category_dims = ['survived_or_not', 'Pclass', 'Sex']
display_names = {'survived_or_not': 'Survived', 'Pclass': 'Class'}

fig = px.parallel_categories(
    titanic,
    dimensions=category_dims,
    color='Age',
    color_continuous_scale=px.colors.sequential.Aggrnyl,
    labels=display_names,
)

fig.update_layout(title_text='Titanic Parallel Categories Diagram')
fig.show()

Tables

Purpose: Display values in table format

In [177]:
# First five rows of `netflix`, restricted to the four columns shown in the tables
tab_netflix = netflix[['title', 'release_year', 'duration', 'country']].head(5)
In [180]:
# Basic table: one header label per column, cell values supplied column by column
header_labels = ['Title', 'Release Year', 'Duration', ' Country']
column_values = [tab_netflix[column] for column in ('title', 'release_year', 'duration', 'country')]

table_trace = go.Table(header=dict(values=header_labels),
                       cells=dict(values=column_values))
fig = go.Figure(data=[table_trace])

fig.show()

Tables - Cell Color

Purpose: Display values in table format with each column and header with different colors

In [184]:
# One colour per column, used for both the cell fill and the cell borders
colors = ['lightblue', 'lightpink', 'lightgreen', 'yellow']

header_style = dict(values=['Title', 'Release Year', 'Duration', 'Country'],
                    line_color='white',
                    fill_color='gray',
                    align='center',
                    font=dict(color='white', size=12))

cell_style = dict(values=[tab_netflix['title'], tab_netflix['release_year'],
                          tab_netflix['duration'], tab_netflix['country']],
                  line_color=colors,
                  fill_color=colors,
                  align='center',
                  font=dict(color='black', size=11))

fig = go.Figure(data=[go.Table(header=header_style, cells=cell_style)])

fig.show()

Figure Factory Tables

Purpose: Display values in table format (Figure Factory format)

In [186]:
# Figure-factory table built straight from the dataframe
fig = ff.create_table(table_text=tab_netflix)
fig.show()

Figure Factory Tables - Colors & Size

Purpose: Display values in figure factory table format with custom colors and sizes

In [187]:
# Three-stop colour scale passed to the figure-factory table
colorscale = [
    [0, '#4d004c'],
    [.5, '#f2e5ff'],
    [1, '#ffffff'],
]
fig = ff.create_table(tab_netflix, colorscale=colorscale)
fig.update_layout(width=1250)  # widen the layout

# The table text is stored as layout annotations; enlarge each one
for cell_text in fig.layout.annotations:
    cell_text.font.size = 14

fig.show()

Subplot

Purpose: Display more than one plot and arrange by row and column

Question: Plot Covid death time series for Canada, Russia, UK, Australia

In [190]:
def _deaths_by_date(country):
    """Return total deaths per observation date for one country.

    Parameters
    ----------
    country : str
        Value to match exactly against the 'Country/Region' column of `covid`.

    Returns
    -------
    pandas.DataFrame
        Columns 'ObservationDate' (datetime) and 'Deaths' (sum over all rows
        sharing that date), sorted by date.
    """
    rows = covid.loc[covid['Country/Region'] == country, ['ObservationDate', 'Deaths']]
    # ObservationDate is MM/DD/YYYY text; parse it so the grouping sorts
    # chronologically and the plots get a real date axis.
    rows = rows.assign(ObservationDate=pd.to_datetime(rows['ObservationDate'], format='%m/%d/%Y'))
    return rows.groupby('ObservationDate').sum().reset_index()


covid_can = _deaths_by_date("Canada")
covid_rus = _deaths_by_date("Russia")
covid_uk = _deaths_by_date("UK")
covid_aus = _deaths_by_date("Australia")
In [192]:
# 2 x 2 grid, one country per panel; the titles follow the panel order below
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Canada Covid Deaths", "Russia Covid Deaths", "UK Covid Deaths", "Australia Covid Deaths"),
)

# (data, legend name, row, col) for each panel
grid_panels = [
    (covid_can, "Canada", 1, 1),
    (covid_rus, "Russia", 1, 2),
    (covid_uk, "UK", 2, 1),
    (covid_aus, "Australia", 2, 2),
]
for deaths, label, panel_row, panel_col in grid_panels:
    fig.add_trace(
        go.Scatter(x=deaths['ObservationDate'], y=deaths['Deaths'], name=label),
        row=panel_row, col=panel_col,
    )

fig.update_layout(height=600, width=800, title_text="Countries Covid Deaths-Side By Side Subplots")
fig.show()

Subplot - Shared Axes

Purpose: Display more than one plot and arrange by row and column with common x axis

Question: Plot Covid death time series for Canada, Russia, UK, Australia with common x axis?

In [197]:
# Four stacked panels sharing one x axis
# (use shared_yaxes=True instead for shared y axes)
fig = make_subplots(rows=4, cols=1,
                    shared_xaxes=True,
                    vertical_spacing=0.02)

stacked_panels = [
    (covid_can, "Canada"),
    (covid_rus, "Russia"),
    (covid_uk, "UK"),
    (covid_aus, "Australia"),
]
# Rows are numbered from 1, top to bottom
for panel_row, (deaths, label) in enumerate(stacked_panels, start=1):
    fig.add_trace(
        go.Scatter(x=deaths['ObservationDate'], y=deaths['Deaths'], name=label),
        row=panel_row, col=1,
    )

fig.update_layout(height=600, width=800, title_text="Countries Covid Deaths-Shared Axis Subplots")
fig.show()

Subplot - Shared Colorscale

Purpose: Display more than one plot and arrange by row and column

Question: Plot Netflix Show Counts for US, India, Japan, Canada

In [198]:
def _release_year_counts(country):
    """Count titles per release year for one country.

    Parameters
    ----------
    country : str
        Value to match exactly against the 'country' column of `netflix`.

    Returns
    -------
    pandas.DataFrame
        Columns 'release_year' and 'count', most frequent year first.
    """
    year_counts = netflix.loc[netflix['country'] == country, 'release_year'].value_counts()
    # Name the index and the values explicitly. Renaming the columns that
    # reset_index() happens to produce ('index' / 'release_year') yields two
    # 'count' columns on pandas >= 2.0.
    return year_counts.rename_axis('release_year').reset_index(name='count')


net_us = _release_year_counts('United States')

net_ind = _release_year_counts('India')

net_jap = _release_year_counts('Japan')

net_can = _release_year_counts('Canada')
In [200]:
# One row of four bar charts sharing the y axis and a single colour axis
fig = make_subplots(
    rows=1, cols=4,
    subplot_titles=("US Netflix Shows", "India Netflix Shows", "Japan Netflix Shows", "Canada Netflix Shows"),
    shared_yaxes=True,
)

country_panels = [
    (net_us, "US"),
    (net_ind, "India"),
    (net_jap, "Japan"),
    (net_can, "Canada"),
]
# Columns are numbered from 1, left to right; bars are coloured by their count
for panel_col, (year_counts, label) in enumerate(country_panels, start=1):
    fig.add_trace(
        go.Bar(x=year_counts['release_year'], y=year_counts['count'], name=label,
               marker=dict(color=year_counts['count'], coloraxis="coloraxis1")),
        row=1, col=panel_col,
    )

fig.update_layout(coloraxis=dict(colorscale='RdBu'),
                  title_text="Countries Netflix show counts -Shared Colorscale Subplots",
                  showlegend=False)
fig.show()

Mixed Subplot

Purpose: Display more than one plot of different types and arrange by row and column

Question: Plot Netflix show counts (Bar chart) & Playstore Apps (Pie chart) together?

In [201]:
# Five most frequent Play Store categories with their counts.
# The index and the values are named explicitly; renaming the columns that
# reset_index() happens to produce ('index' / 'Category') yields two 'count'
# columns on pandas >= 2.0.
categ_apps = (
    google['Category']
    .value_counts()
    .head(5)
    .rename_axis('Category')
    .reset_index(name='count')
)
In [206]:
# Two panels of different kinds: `specs` declares a bar panel and a pie panel
panel_specs = [[{"type": "bar"}, {"type": "pie"}]]
fig = make_subplots(rows=1, cols=2,
                    subplot_titles=["Indian Netflix shows by year", "Playstore Apps Categories"],
                    specs=panel_specs)

india_bars = go.Bar(x=net_ind['release_year'], y=net_ind['count'], name="India")
category_pie = go.Pie(labels=categ_apps['Category'], values=categ_apps['count'])

fig.add_trace(india_bars, row=1, col=1)
fig.add_trace(category_pie, row=1, col=2)

fig.update_layout(title_text="Multiple Subplots", showlegend=False)

fig.show()